The ´fulltext´-package includes a lightweight htmlwidget for fulltext output.
emma_txt <- janeaustenr::emma[grep("^CHAPTER\\s+.*?$", janeaustenr::emma)[1]:length(janeaustenr::emma)]
chapter_beginnings <- grep("^CHAPTER\\s+.*?$", emma_txt)
chapters <- split(
emma_txt,
cut(
1L:length(emma_txt),
c(chapter_beginnings, length(emma_txt)),
include.lowest = TRUE, right = FALSE
)
)
reconstruct_paragraphs <- function(x){
paras <- split(
x,
cut(
1L:length(x),
unique(c(1L, grep("^\\s*$", x), length(x))),
include.lowest = TRUE, right = FALSE
)
)
paras <- lapply(paras, function(x) x[x != ""])
for (i in rev(which(lapply(paras, length) == 0))) paras[[i]] <- NULL
sapply(paras, function(p) paste(p, collapse = " "))
}
chs <- lapply(chapters, reconstruct_paragraphs)
names(chs) <- sprintf("Chapter_%d", 1:length(chs))
as_paragraphdata <- function(x, name){
paras_tok <- tokenizers::tokenize_words(x, lowercase = FALSE, strip_punct = FALSE)
df_list <- lapply(
paras_tok,
function(para){
df <- data.frame(token = para, tag_before = " ", tag_after = "", stringsAsFactors = FALSE)
whitespace <- grep("^[\\.;,:!?\\)\\(]$", df[["token"]], perl = TRUE)
if (length(whitespace) > 0L) df[whitespace, "tag_before"] <- ""
if (grepl("CHAPTER", df[1,"token"])){
df[1,"tag_before"] <- sprintf("<h2 style='display:block' name='%s'>", name)
df[nrow(df), "tag_after"] <- "</h2>"
} else {
df[1,"tag_before"] <- sprintf("<para style='display:block' name='%s'>", name)
df[nrow(df), "tag_after"] <- "</para>"
}
df
}
)
do.call(rbind, df_list)
}
ftxt_list <- lapply(names(chs), function(ch) data.frame(as_paragraphdata(chs[[ch]], name = ch), chapter = ch))We introduce the fulltext package by example. In addition to the fulltext package, we need the polmineR package which includes the GERMAPARLMINI corpus.
## ... activating corpus: GERMAPARLMINI
## ... activating corpus: REUTERS
The example aims at outputting one particular speech. We take a speech held by Voker Kauder in the German Bundestag.
The data that is passed to the JavaScript that generates the output. Expected to be a list of lists that provide data on sections of text. Each of the sub-lists is to be a named list of a character vector with the HTML element the section will be wrapped into, and a data.frame (or a list) with a column “token”, and a column “id”.
library(crosstalk)
austen_chapters <- do.call(rbind, ftxt_list)
austen_chapters[["tag_before"]] <- gsub("display:block", "display:none", austen_chapters[["tag_before"]])
sd <- crosstalk::SharedData$new(austen_chapters, ~chapter, group = "fulltext")
chapters_table <- data.frame(chapter = levels(austen_chapters$chapter))
chapters_table_sd <- crosstalk::SharedData$new(chapters_table, ~chapter, group = "fulltext")
y <- bscols(
# widths = c(NA,NA),
DT::datatable(
chapters_table_sd,
options = list(lengthChange = TRUE, pageLength = 8L, pagingType = "simple", dom = "tp"),
rownames = NULL, width = "100%", selection = "single"
),
fulltext(sd, width = "100%", box = TRUE)
)Enjoy!